# Computations
import numpy as np
import pandas as pd
import pickle
# preprocessing
from sklearn import preprocessing
import re
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
# Global plotting configuration shared by every figure in the notebook.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# Jupyter cell magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import warnings
# Suppress library warnings to keep cell output readable.
warnings.filterwarnings("ignore")
In this study, we analyze HR data available from kaggle.com. The data is fictional and was created by IBM data scientists.
Categorical Parameters:
| 1 | 2 | 3 | 4 | 5 | |
|---|---|---|---|---|---|
| Education | Below College | College | Bachelor | Master | Doctor |
| Environment Satisfaction | Low | Medium | High | Very High | |
| Job Involvement | Low | Medium | High | Very High | |
| Job Satisfaction | Low | Medium | High | Very High | |
| Performance Rating | Low | Good | Excellent | Outstanding | |
| Relationship Satisfaction | Low | Medium | High | Very High | |
| Work Life Balance | Bad | Good | Better | Best | |
This can be encoded as follows,
# Maps each ordinal survey feature to {code: label}; codes start at 1.
Categorical_Dict = {
    'Education': dict(enumerate(['Below College', 'College', 'Bachelor', 'Master', 'Doctor'], start=1)),
    'Environment Satisfaction': dict(enumerate(['Low', 'Medium', 'High', 'Very High'], start=1)),
    'Job Involvement': dict(enumerate(['Low', 'Medium', 'High', 'Very High'], start=1)),
    'Job Satisfaction': dict(enumerate(['Low', 'Medium', 'High', 'Very High'], start=1)),
    'Performance Rating': dict(enumerate(['Low', 'Good', 'Excellent', 'Outstanding'], start=1)),
    'Relationship Satisfaction': dict(enumerate(['Low', 'Medium', 'High', 'Very High'], start=1)),
    'Work Life Balance': dict(enumerate(['Bad', 'Good', 'Better', 'Best'], start=1)),
}
# Load the raw Excel export and normalize the column names.
Path = 'Data/WA_Fn-UseC_-HR-Employee-Attrition.xlsx'
Data = pd.read_excel(Path)
# Split CamelCase names into spaced words, then expand the abbreviations.
spaced_names = [re.sub(r"(\w)([A-Z])", r"\1 \2", name) for name in Data.columns]
Data.columns = [
    name.replace(' Curr ', ' Current ').replace('18', ' 18').replace('Num ', 'Number Of ')
    for name in spaced_names
]
del spaced_names
# 'Travel_Rarely' / 'Non-Travel' style values -> spaces instead of underscores.
Data['Business Travel'] = Data['Business Travel'].str.replace('_', ' ')
display(Data.head(8).style.hide_index())
# Prediction target and the feature columns (everything except the target
# and the unique employee identifier).
Target = 'Attrition'
Featured_Columns = list(set(Data.columns) - {Target, 'Employee Number'})
| Age | Attrition | Business Travel | Daily Rate | Department | Distance From Home | Education | Education Field | Employee Count | Employee Number | Environment Satisfaction | Gender | Hourly Rate | Job Involvement | Job Level | Job Role | Job Satisfaction | Marital Status | Monthly Income | Monthly Rate | Number Of Companies Worked | Over 18 | Over Time | Percent Salary Hike | Performance Rating | Relationship Satisfaction | Standard Hours | Stock Option Level | Total Working Years | Training Times Last Year | Work Life Balance | Years At Company | Years In Current Role | Years Since Last Promotion | Years With Current Manager |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 41 | Yes | Travel Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 49 | No | Travel Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 37 | Yes | Travel Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 33 | No | Travel Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 27 | No | Travel Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| 32 | No | Travel Frequently | 1005 | Research & Development | 2 | 2 | Life Sciences | 1 | 8 | 4 | Male | 79 | 3 | 1 | Laboratory Technician | 4 | Single | 3068 | 11864 | 0 | Y | No | 13 | 3 | 3 | 80 | 0 | 8 | 2 | 2 | 7 | 7 | 3 | 6 |
| 59 | No | Travel Rarely | 1324 | Research & Development | 3 | 3 | Medical | 1 | 10 | 3 | Female | 81 | 4 | 1 | Laboratory Technician | 1 | Married | 2670 | 9964 | 4 | Y | Yes | 20 | 4 | 1 | 80 | 3 | 12 | 3 | 2 | 1 | 0 | 0 | 0 |
| 30 | No | Travel Rarely | 1358 | Research & Development | 24 | 1 | Life Sciences | 1 | 11 | 4 | Male | 67 | 3 | 1 | Laboratory Technician | 3 | Divorced | 2693 | 13335 | 1 | Y | No | 22 | 4 | 2 | 80 | 1 | 1 | 2 | 3 | 1 | 0 | 0 | 0 |
First off, let's take a look at the dataset.
def Data_Plot(Inp, Title=None, W=None):
    """Bar-plot the percentage of non-null values per column, colored by dtype.

    Parameters
    ----------
    Inp : pd.DataFrame
        Frame to summarize.
    Title : str, optional
        Bold, centered figure title.
    W : int, optional
        Figure width in pixels.

    Returns
    -------
    pd.DataFrame
        Columns: Features, Data Type, Number of NaN Values, Size, Percentage.
    """
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    nan_counts = Inp.isnull().sum().to_frame(name='Number of NaN Values')
    data_info = data_info.join(nan_counts, how='outer')
    data_info['Size'] = Inp.shape[0]
    # Percentage of non-missing values in each column.
    data_info['Percentage'] = 100 - np.round(100 * (data_info['Number of NaN Values'] / Inp.shape[0]), 2)
    data_info = data_info.reset_index(drop=False).rename(columns={'index': 'Features'})
    #
    fig = px.bar(data_info, x='Features', y='Percentage', color='Data Type',
                 text='Percentage',
                 color_discrete_sequence=['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data=data_info.columns)
    fig.update_layout(plot_bgcolor='white', legend=dict(x=1.01, y=.5, traceorder="normal",
                                                        bordercolor="DarkGray", borderwidth=1))
    # `is not None` instead of `not W == None` (PEP 8: identity check for None).
    if W is not None:
        fig.update_layout(width=W)
    # NOTE(review): '%%{text}' renders a literal '%' *before* the value; if a
    # trailing percent sign was intended, the template should be '%{text}%'.
    fig.update_traces(texttemplate=10 * ' ' + '%%{text}', textposition='inside')
    fig.update_traces(marker_line_color='Black', marker_line_width=1., opacity=1)
    if Title is not None:
        fig.update_layout(title={'text': '<b>' + Title + '<b>', 'x': 0.5,
                                 'y': 0.90, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
    return data_info
_ = Data_Plot(Data, Title = 'IBM HR Analytics Employee Attrition and Performance Dataset')
Moreover,
def Distinct_Observations(Inp, Target = Target, Featured_Columns = None, YL = None):
    """Bar-plot the number of distinct values in each feature column of Inp.

    Parameters
    ----------
    Inp : pd.DataFrame
        Frame whose columns are counted.
    Target : str
        Column excluded when Featured_Columns is not given (default bound to
        the module-level Target at definition time).
    Featured_Columns : list of str, optional
        Columns to count; defaults to all of Inp's columns minus Target.
    YL : int, optional
        Upper limit of the y axis.

    Returns
    -------
    pd.Series
        Distinct-value count per feature column.
    """
    if Featured_Columns is None:
        # BUGFIX: derive the default from Inp, not the global `Data`, so the
        # function works for any input frame.
        Featured_Columns = list(set(Inp.columns) - {Target})
    counts = Inp[Featured_Columns].nunique()
    fig = go.Figure([go.Bar(x=counts.index, y=counts.values)])
    fig.update_traces(marker_line_color= 'Navy', marker_line_width=1, opacity=1, showlegend = False)
    fig.update_layout(legend_orientation='v', plot_bgcolor= 'white', height= 450, width= 980,
                      title={'text': '<b>' + 'Distinct Observations in Each Column' + '<b>', 'x':0.5,
                             'y': 0.92, 'xanchor': 'center', 'yanchor': 'top'},
                      yaxis_title='Frequency')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=False, gridwidth=1, gridcolor='Lightgray')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray')
    if YL is not None:
        fig.update_yaxes(range =[0, YL])
    fig.show()
    return counts
_ = Distinct_Observations(Inp = Data, Featured_Columns = Featured_Columns, YL = 1500)
We also need to convert categorical data to numeric data.
def dtypes_group(Inp):
    """Summarize Inp's columns grouped by dtype.

    Returns a frame with one row per dtype and the columns
    'Data Type' (str), 'Features' (list of column names), 'Count' (int).
    """
    sorted_types = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    unique_types = sorted_types['Data Type'].unique()
    Out = pd.DataFrame(index=unique_types, columns=['Features', 'Count'])
    for dtype_val in unique_types:
        members = sorted_types.index[sorted_types['Data Type'] == dtype_val].tolist()
        Out.at[dtype_val, 'Features'] = members
        Out.at[dtype_val, 'Count'] = len(members)
    Out = Out.reset_index(drop=False).rename(columns={'index': 'Data Type'})
    Out['Data Type'] = Out['Data Type'].astype(str)
    return Out
def dtype_sep(Inp):
    """Split Inp's column names into numeric and categorical lists.

    Numeric means dtype in {int64, int32, float64, float32}; categorical
    means dtype 'object'. Column order is preserved in both lists.
    """
    dtype_names = Inp.dtypes.astype(str)
    numeric_kinds = ('int64', 'int32', 'float64', 'float32')
    Numeric_Columns = [col for col, kind in dtype_names.items() if kind in numeric_kinds]
    Categorical_Columns = [col for col, kind in dtype_names.items() if kind == 'object']
    return Numeric_Columns, Categorical_Columns
# Partition the columns by dtype and display the grouped summary table.
Numeric_Columns, Categorical_Columns = dtype_sep(Data)
display(dtypes_group(Data).style.hide_index())
| Data Type | Features | Count |
|---|---|---|
| int64 | ['Age', 'Years In Current Role', 'Years At Company', 'Work Life Balance', 'Training Times Last Year', 'Total Working Years', 'Stock Option Level', 'Standard Hours', 'Relationship Satisfaction', 'Performance Rating', 'Percent Salary Hike', 'Number Of Companies Worked', 'Monthly Rate', 'Monthly Income', 'Years Since Last Promotion', 'Job Satisfaction', 'Years With Current Manager', 'Job Level', 'Daily Rate', 'Distance From Home', 'Education', 'Employee Number', 'Environment Satisfaction', 'Employee Count', 'Hourly Rate', 'Job Involvement'] | 26 |
| object | ['Over 18', 'Job Role', 'Education Field', 'Gender', 'Department', 'Business Travel', 'Attrition', 'Over Time', 'Marital Status'] | 9 |
We can use LabelEncoder to convert the categorical columns to numeric codes. Therefore,
# A copy of the dataset
df = Data.copy()
N = len(Categorical_Columns)
# Progress bar tracking the encoding loop.
Progress_Bar = progressbar.ProgressBar(maxval= N, widgets=[progressbar.Bar('=', '|', '|'), progressbar.Percentage()])
#--------------- the loop ----------------------
Progress_Bar.start()
for i, col in enumerate(Categorical_Columns, start=1):
    # fit_transform replaces the redundant fit(list(...)) + transform pair;
    # LabelEncoder assigns integer codes in sorted (alphabetical) label order.
    df[col] = preprocessing.LabelEncoder().fit_transform(df[col])
    # Update *after* processing so the bar reflects completed columns
    # (the original updated with the pre-increment counter, lagging by one).
    Progress_Bar.update(i)
Progress_Bar.finish()
#--------------- End of the loop ---------------
# Finally, converting values of df back to integers.
df = df.astype(int)
display(dtypes_group(df).style.hide_index())
|=========================================================================|100%
| Data Type | Features | Count |
|---|---|---|
| int32 | ['Age', 'Monthly Rate', 'Number Of Companies Worked', 'Over 18', 'Over Time', 'Percent Salary Hike', 'Performance Rating', 'Monthly Income', 'Relationship Satisfaction', 'Stock Option Level', 'Total Working Years', 'Training Times Last Year', 'Work Life Balance', 'Years At Company', 'Years In Current Role', 'Standard Hours', 'Years Since Last Promotion', 'Marital Status', 'Job Role', 'Attrition', 'Business Travel', 'Daily Rate', 'Department', 'Distance From Home', 'Education', 'Job Satisfaction', 'Education Field', 'Employee Number', 'Environment Satisfaction', 'Gender', 'Hourly Rate', 'Job Involvement', 'Job Level', 'Employee Count', 'Years With Current Manager'] | 35 |
# Persist the label-encoded dataset as CSV next to the source Excel file.
df.to_csv (Path.split(".")[0]+'.csv', index = None, header=True)
Moreover,
def FeatBins(Inp, Bins, replace = True):
    """Bin a numeric Series into the right-closed intervals defined by Bins.

    Parameters
    ----------
    Inp : pd.Series
        Numeric values to discretize.
    Bins : sequence of numbers
        Bin edges; consecutive pairs (a, b) become intervals (a, b].
    replace : bool, default True
        True  -> return ordinal integer codes (0 = lowest interval).
        False -> return interval labels as strings, rewritten for display
                 ('(' -> '[' and '-1' -> '0', so '(-1, 2]' reads '[0, 2]').

    Returns
    -------
    pd.Series of int codes (replace=True) or str labels (replace=False).
    """
    Bins = [int(x) for x in Bins]
    intervals = pd.IntervalIndex.from_tuples([(lo, hi) for lo, hi in zip(Bins[:-1], Bins[1:])])
    Out = pd.cut(Inp, bins = intervals)
    if replace:
        # BUGFIX: use the categorical's own codes, which follow true interval
        # order. The original sorted the *string* labels, so '(10, 15]' sorted
        # before '(5, 10]' and multi-digit bins got mis-ordered ordinal codes.
        return Out.cat.codes.astype('int64')
    Out = Out.astype('str')
    # BUGFIX: regex=False makes '(' a literal. Under the old regex=True default
    # the unbalanced '(' raised, and a bare `except: pass` silently skipped
    # this display cleanup altogether.
    Out = Out.str.replace('(', '[', regex=False).str.replace('-1', '0', regex=False)
    return Out
# Bin edges for discretizing the continuous features. pd.cut intervals are
# right-closed, so a leading -1 edge makes the value 0 fall into the first bin.
Bin_Dict = {
    'Age':                        [15, 25, 35, 50, 60, 80],
    'Daily Rate':                 [100, 300, 600, 1000, 1500],
    'Distance From Home':         [0, 5, 10, 20, 30],
    'Hourly Rate':                [25, 50, 75, 101],
    'Monthly Income':             [1e3, 3e3, 7e3, 1e4, 2e4],
    'Monthly Rate':               [2e3, 1e4, 2e4, 3e4],
    'Number Of Companies Worked': [-1, 2, 4, 6, 10],
    'Percent Salary Hike':        list(np.arange(10, 27, 4)),
    'Total Working Years':        [-1, 10, 20, 30, 41],
    'Years At Company':           [-1, 10, 20, 30, 41],
    'Years In Current Role':      [-1, 4, 8, 12, 19],
    'Years Since Last Promotion': [-1, 5, 10, 16],
    'Years With Current Manager': [-1, 4, 8, 12, 18],
}
# Discretize each configured feature in place, then re-count distinct values.
for feature_name, bin_edges in Bin_Dict.items():
    df[feature_name] = FeatBins(Inp = df[feature_name], Bins = bin_edges)
_ = Distinct_Observations(Inp = df, Featured_Columns = Featured_Columns, YL = 10)
First, we remove features that have zero variance as these features don't add anything to our modeling.
# Identify and drop features whose variance is (numerically) zero: they carry
# no information for modeling. round(16) absorbs floating-point noise.
zero_var_feats = df[Featured_Columns].var().sort_values(ascending = False)
zero_var_feats = zero_var_feats.loc[zero_var_feats.round(16) == 0].index.tolist()
print(Back.BLACK + Fore.CYAN + Style.NORMAL + 'Features with variance zero' + Style.RESET_ALL + ':' + '%s' % ', '.join(zero_var_feats))
df = df.drop(columns = zero_var_feats)
del zero_var_feats
Features with variance zero:Employee Count, Over 18, Standard Hours
Moreover, high variance for some features can hurt our modeling process. For this reason, we would like to standardize features by removing the mean and scaling to unit variance. In this article, we demonstrated the benefits of scaling data using StandardScaler().
# Columns excluded from the feature matrix X: the target label and the
# unique employee identifier (no predictive value).
Aditional_Columns = [Target, 'Employee Number']
X = df.drop(columns = Aditional_Columns)
def Feature_Normalize(X, PD):
    """Standardize X (zero mean, unit variance) and plot per-feature variance
    heatmaps before and after scaling.

    Parameters
    ----------
    X : pd.DataFrame
        Numeric feature matrix.
    PD : dict
        Plot settings: 'figsize', 'hspace', 'annot_text_size', and
        'word_break' (words per tick-label line, or None to disable wrapping).

    Returns
    -------
    pd.DataFrame
        Standardized values with X's columns.
    """
    def List_Break(mylist, n = PD['word_break']):
        # Wrap each label to at most n words per line.
        # BUGFIX: the original reused the loop variable `n` and re-joined to
        # the *end* of the word list (`y[n:]`), which duplicated words for any
        # label longer than two wrap-lines.
        Out = []
        for label in mylist:
            words = label.split()
            Out.append('\n'.join(' '.join(words[i:i + n]) for i in range(0, len(words), n)))
        return Out
    # Zero-mean / unit-variance scaling.
    scaler = preprocessing.StandardScaler()
    X_std = scaler.fit_transform(X)
    X_std = pd.DataFrame(data = X_std, columns = X.columns)
    fig, ax = plt.subplots(2, 1, figsize = PD['figsize'])
    ax = ax.ravel()
    CP = [sns.color_palette("OrRd", 20), sns.color_palette("Greens", X.shape[1])]
    Names = ['Variance of the Features', 'Variance of the Features (Standardized)']
    Sets = [X, X_std]
    kws = dict(label='Feature\nVariance', aspect=10, shrink= .3)
    for i in range(len(ax)):
        # Single-row frame of per-feature variances, highest first.
        Temp = Sets[i].var().sort_values(ascending = False).to_frame(name= 'Variance').round(2).T
        _ = sns.heatmap(Temp, ax=ax[i], annot=True, square=True, cmap = CP[i],
                        linewidths = 0.8, vmin=0, vmax=Temp.max(axis =1)[0], annot_kws={"size": PD['annot_text_size']},
                        cbar_kws=kws)
        if PD['word_break'] is not None:
            mylist = List_Break(Temp.T.index.tolist())
            _ = ax[i].xaxis.set_ticklabels(mylist)
        _ = ax[i].set_yticklabels('')
        _ = ax[i].set_title(Names[i], weight='bold', fontsize = 14)
        _ = ax[i].set_aspect(1)
        del Temp
    plt.subplots_adjust(hspace=PD['hspace'])
    Out = pd.DataFrame(data = X_std, columns = X.columns.tolist())
    return Out
# Plot settings shared by the normalization/correlation figures, then
# overwrite the feature columns of df with their standardized values.
PD = {'figsize': (20, 7), 'hspace': 0.2, 'annot_text_size': 10, 'word_break': 2}
df[X.columns.tolist()] = Feature_Normalize(X, PD)
def Feature_Corr(df, PD, Target = Target):
    """Plot a one-row heatmap of each feature's correlation with Target,
    sorted in ascending order.

    Parameters
    ----------
    df : pd.DataFrame
        Fully numeric frame including the Target column.
    PD : dict
        Plot settings: 'figsize', 'Colormap', 'annot_text_size', 'word_break'.
    Target : str
        Column to correlate every other column against (default bound to the
        module-level Target at definition time).
    """
    def List_Break(mylist, n = PD['word_break']):
        # Wrap each label to at most n words per line.
        # BUGFIX: the original shadowed `n` with the inner loop variable and
        # joined `y[n:]` to the end, duplicating words for long labels.
        Out = []
        for label in mylist:
            words = label.split()
            Out.append('\n'.join(' '.join(words[i:i + n]) for i in range(0, len(words), n)))
        return Out
    fig, ax = plt.subplots(1, 1, figsize = PD['figsize'])
    # One-row matrix: correlation of Target with every other column.
    CM = df.corr().round(2)
    CM = CM.loc[(CM.index == Target)].drop(columns = Target).T.sort_values(by = Target).T
    # BUGFIX: colorbar previously said 'Feature Variance' (copy-paste from
    # Feature_Normalize); these values are correlations.
    kws = dict(label='Feature\nCorrelation', aspect=10, shrink= .3)
    _ = sns.heatmap(CM, ax=ax, annot=True, square=True, cmap = PD['Colormap'], linewidths = 0.8, vmin=0,
                    vmax=CM.max(axis =1)[0], annot_kws={"size": PD['annot_text_size']}, cbar_kws=kws)
    if PD['word_break'] is not None:
        mylist = List_Break(CM.T.index.tolist())
        _ = ax.xaxis.set_ticklabels(mylist)
    _ = ax.set_yticklabels('')
    _ = ax.set_aspect(1)
# Plot the correlation heatmap, then persist the standardized dataset and the
# two lookup dictionaries next to the source file.
PD.update(Colormap = 'Greens')
Feature_Corr(df, PD)
base_name = Path.split(".")[0]
df.to_csv(base_name + '_STD.csv', index = None, header=True)
with open(base_name + '_Categorical_Dict.pkl', 'wb') as fp:
    pickle.dump(Categorical_Dict, fp, protocol=pickle.HIGHEST_PROTOCOL)
with open(base_name + '_Bin_Dict.pkl', 'wb') as fp:
    pickle.dump(Bin_Dict, fp, protocol=pickle.HIGHEST_PROTOCOL)